; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck  %s
; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s

define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
; CHECK-LABEL: @transpose_multiply(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8
; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6
; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST4]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST6]], align 8
; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3
; CHECK-NEXT:    [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST9]], align 8
; CHECK-NEXT:    [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6
; CHECK-NEXT:    [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST12]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x double> undef, double [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x double> [[TMP5]], double [[TMP6]], i64 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x double> undef, double [[TMP8]], i64 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 1
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <3 x double> [[TMP11]], double [[TMP12]], i64 2
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <3 x double> undef, double [[TMP14]], i64 0
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2
; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 1
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <3 x double> [[TMP17]], double [[TMP18]], i64 2
; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP21:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP23:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]]
; CHECK-NEXT:    [[TMP24:%.*]] = fadd <1 x double> [[TMP21]], [[TMP23]]
; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP25]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP26:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]]
; CHECK-NEXT:    [[TMP27:%.*]] = fadd <1 x double> [[TMP24]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <1 x double> [[TMP27]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP28]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK20:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP30]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP31:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]]
; CHECK-NEXT:    [[BLOCK23:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP32]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP33:%.*]] = fmul <1 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
; CHECK-NEXT:    [[TMP34:%.*]] = fadd <1 x double> [[TMP31]], [[TMP33]]
; CHECK-NEXT:    [[BLOCK26:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x double> poison, double [[TMP35]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT27]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP36:%.*]] = fmul <1 x double> [[BLOCK26]], [[SPLAT_SPLAT28]]
; CHECK-NEXT:    [[TMP37:%.*]] = fadd <1 x double> [[TMP34]], [[TMP36]]
; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <1 x double> [[TMP37]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <3 x double> [[TMP29]], <3 x double> [[TMP38]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK29:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x double> poison, double [[TMP40]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT30]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP41:%.*]] = fmul <1 x double> [[BLOCK29]], [[SPLAT_SPLAT31]]
; CHECK-NEXT:    [[BLOCK32:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT33]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP43:%.*]] = fmul <1 x double> [[BLOCK32]], [[SPLAT_SPLAT34]]
; CHECK-NEXT:    [[TMP44:%.*]] = fadd <1 x double> [[TMP41]], [[TMP43]]
; CHECK-NEXT:    [[BLOCK35:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x double> poison, double [[TMP45]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT36]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP46:%.*]] = fmul <1 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
; CHECK-NEXT:    [[TMP47:%.*]] = fadd <1 x double> [[TMP44]], [[TMP46]]
; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <1 x double> [[TMP47]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <3 x double> [[TMP39]], <3 x double> [[TMP48]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[BLOCK38:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x double> poison, double [[TMP50]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT39]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP51:%.*]] = fmul <1 x double> [[BLOCK38]], [[SPLAT_SPLAT40]]
; CHECK-NEXT:    [[BLOCK41:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT42]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP53:%.*]] = fmul <1 x double> [[BLOCK41]], [[SPLAT_SPLAT43]]
; CHECK-NEXT:    [[TMP54:%.*]] = fadd <1 x double> [[TMP51]], [[TMP53]]
; CHECK-NEXT:    [[BLOCK44:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x double> poison, double [[TMP55]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP56:%.*]] = fmul <1 x double> [[BLOCK44]], [[SPLAT_SPLAT46]]
; CHECK-NEXT:    [[TMP57:%.*]] = fadd <1 x double> [[TMP54]], [[TMP56]]
; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <1 x double> [[TMP57]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP58]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK47:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x double> poison, double [[TMP60]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT48]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP61:%.*]] = fmul <1 x double> [[BLOCK47]], [[SPLAT_SPLAT49]]
; CHECK-NEXT:    [[BLOCK50:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP63:%.*]] = fmul <1 x double> [[BLOCK50]], [[SPLAT_SPLAT52]]
; CHECK-NEXT:    [[TMP64:%.*]] = fadd <1 x double> [[TMP61]], [[TMP63]]
; CHECK-NEXT:    [[BLOCK53:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x double> poison, double [[TMP65]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT54]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP66:%.*]] = fmul <1 x double> [[BLOCK53]], [[SPLAT_SPLAT55]]
; CHECK-NEXT:    [[TMP67:%.*]] = fadd <1 x double> [[TMP64]], [[TMP66]]
; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <1 x double> [[TMP67]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <3 x double> [[TMP59]], <3 x double> [[TMP68]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK56:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT57:%.*]] = insertelement <1 x double> poison, double [[TMP70]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT58:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT57]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP71:%.*]] = fmul <1 x double> [[BLOCK56]], [[SPLAT_SPLAT58]]
; CHECK-NEXT:    [[BLOCK59:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT60]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP73:%.*]] = fmul <1 x double> [[BLOCK59]], [[SPLAT_SPLAT61]]
; CHECK-NEXT:    [[TMP74:%.*]] = fadd <1 x double> [[TMP71]], [[TMP73]]
; CHECK-NEXT:    [[BLOCK62:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT63:%.*]] = insertelement <1 x double> poison, double [[TMP75]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT64:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT63]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP76:%.*]] = fmul <1 x double> [[BLOCK62]], [[SPLAT_SPLAT64]]
; CHECK-NEXT:    [[TMP77:%.*]] = fadd <1 x double> [[TMP74]], [[TMP76]]
; CHECK-NEXT:    [[TMP78:%.*]] = shufflevector <1 x double> [[TMP77]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP79:%.*]] = shufflevector <3 x double> [[TMP69]], <3 x double> [[TMP78]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[BLOCK65:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT66:%.*]] = insertelement <1 x double> poison, double [[TMP80]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT67:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT66]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP81:%.*]] = fmul <1 x double> [[BLOCK65]], [[SPLAT_SPLAT67]]
; CHECK-NEXT:    [[BLOCK68:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT69:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT70:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT69]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP83:%.*]] = fmul <1 x double> [[BLOCK68]], [[SPLAT_SPLAT70]]
; CHECK-NEXT:    [[TMP84:%.*]] = fadd <1 x double> [[TMP81]], [[TMP83]]
; CHECK-NEXT:    [[BLOCK71:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT72:%.*]] = insertelement <1 x double> poison, double [[TMP85]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT73:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT72]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP86:%.*]] = fmul <1 x double> [[BLOCK71]], [[SPLAT_SPLAT73]]
; CHECK-NEXT:    [[TMP87:%.*]] = fadd <1 x double> [[TMP84]], [[TMP86]]
; CHECK-NEXT:    [[TMP88:%.*]] = shufflevector <1 x double> [[TMP87]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP88]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK74:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT75:%.*]] = insertelement <1 x double> poison, double [[TMP90]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT75]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP91:%.*]] = fmul <1 x double> [[BLOCK74]], [[SPLAT_SPLAT76]]
; CHECK-NEXT:    [[BLOCK77:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT78:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT78]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP93:%.*]] = fmul <1 x double> [[BLOCK77]], [[SPLAT_SPLAT79]]
; CHECK-NEXT:    [[TMP94:%.*]] = fadd <1 x double> [[TMP91]], [[TMP93]]
; CHECK-NEXT:    [[BLOCK80:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT81:%.*]] = insertelement <1 x double> poison, double [[TMP95]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT82:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT81]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP96:%.*]] = fmul <1 x double> [[BLOCK80]], [[SPLAT_SPLAT82]]
; CHECK-NEXT:    [[TMP97:%.*]] = fadd <1 x double> [[TMP94]], [[TMP96]]
; CHECK-NEXT:    [[TMP98:%.*]] = shufflevector <1 x double> [[TMP97]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <3 x double> [[TMP89]], <3 x double> [[TMP98]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK83:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT84:%.*]] = insertelement <1 x double> poison, double [[TMP100]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT85:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT84]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP101:%.*]] = fmul <1 x double> [[BLOCK83]], [[SPLAT_SPLAT85]]
; CHECK-NEXT:    [[BLOCK86:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT87:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT88:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP103:%.*]] = fmul <1 x double> [[BLOCK86]], [[SPLAT_SPLAT88]]
; CHECK-NEXT:    [[TMP104:%.*]] = fadd <1 x double> [[TMP101]], [[TMP103]]
; CHECK-NEXT:    [[BLOCK89:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT90:%.*]] = insertelement <1 x double> poison, double [[TMP105]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT91:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT90]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP106:%.*]] = fmul <1 x double> [[BLOCK89]], [[SPLAT_SPLAT91]]
; CHECK-NEXT:    [[TMP107:%.*]] = fadd <1 x double> [[TMP104]], [[TMP106]]
; CHECK-NEXT:    [[TMP108:%.*]] = shufflevector <1 x double> [[TMP107]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <3 x double> [[TMP99]], <3 x double> [[TMP108]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP49]], <3 x double>* [[VEC_CAST92]], align 8
; CHECK-NEXT:    [[VEC_GEP93:%.*]] = getelementptr double, double* [[TMP110]], i64 3
; CHECK-NEXT:    [[VEC_CAST94:%.*]] = bitcast double* [[VEC_GEP93]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP79]], <3 x double>* [[VEC_CAST94]], align 8
; CHECK-NEXT:    [[VEC_GEP95:%.*]] = getelementptr double, double* [[TMP110]], i64 6
; CHECK-NEXT:    [[VEC_CAST96:%.*]] = bitcast double* [[VEC_GEP95]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP109]], <3 x double>* [[VEC_CAST96]], align 8
; CHECK-NEXT:    ret void
;

; Load columns of input matrixes %A and %B.


; Transpose %A.


; Lower multiply(transpose(%A), %B)


; Store result columns.


entry:
  %a = load <9 x double>, <9 x double>* %A.Ptr, align 8
  %b = load <9 x double>, <9 x double>* %B.Ptr, align 8
  %a.trans  = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
  %c = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
  store <9 x double> %c, <9 x double>* %C.Ptr, align 8
  ret void
}

declare <9 x double> @llvm.matrix.transpose(<9 x double>, i32, i32)
declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32, i32, i32)

define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x double>* %C.Ptr) {
; CHECK-LABEL: @transpose_multiply_add(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP0]], i64 3
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST1]], align 8
; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i64 6
; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast double* [[VEC_GEP3]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST4]], align 8
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST6]], align 8
; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i64 3
; CHECK-NEXT:    [[VEC_CAST9:%.*]] = bitcast double* [[VEC_GEP8]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST9]], align 8
; CHECK-NEXT:    [[VEC_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i64 6
; CHECK-NEXT:    [[VEC_CAST12:%.*]] = bitcast double* [[VEC_GEP11]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST12]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x double> undef, double [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x double> [[TMP5]], double [[TMP6]], i64 2
; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x double> undef, double [[TMP8]], i64 0
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 1
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <3 x double> [[TMP11]], double [[TMP12]], i64 2
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <3 x double> undef, double [[TMP14]], i64 0
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2
; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 1
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <3 x double> [[TMP17]], double [[TMP18]], i64 2
; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP20]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP21:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]]
; CHECK-NEXT:    [[BLOCK14:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP22]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP23:%.*]] = fmul <1 x double> [[BLOCK14]], [[SPLAT_SPLAT16]]
; CHECK-NEXT:    [[TMP24:%.*]] = fadd <1 x double> [[TMP21]], [[TMP23]]
; CHECK-NEXT:    [[BLOCK17:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP25]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP26:%.*]] = fmul <1 x double> [[BLOCK17]], [[SPLAT_SPLAT19]]
; CHECK-NEXT:    [[TMP27:%.*]] = fadd <1 x double> [[TMP24]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <1 x double> [[TMP27]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP28]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK20:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP30]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP31:%.*]] = fmul <1 x double> [[BLOCK20]], [[SPLAT_SPLAT22]]
; CHECK-NEXT:    [[BLOCK23:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP32]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP33:%.*]] = fmul <1 x double> [[BLOCK23]], [[SPLAT_SPLAT25]]
; CHECK-NEXT:    [[TMP34:%.*]] = fadd <1 x double> [[TMP31]], [[TMP33]]
; CHECK-NEXT:    [[BLOCK26:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT27:%.*]] = insertelement <1 x double> poison, double [[TMP35]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT28:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT27]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP36:%.*]] = fmul <1 x double> [[BLOCK26]], [[SPLAT_SPLAT28]]
; CHECK-NEXT:    [[TMP37:%.*]] = fadd <1 x double> [[TMP34]], [[TMP36]]
; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <1 x double> [[TMP37]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <3 x double> [[TMP29]], <3 x double> [[TMP38]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK29:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT30:%.*]] = insertelement <1 x double> poison, double [[TMP40]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT31:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT30]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP41:%.*]] = fmul <1 x double> [[BLOCK29]], [[SPLAT_SPLAT31]]
; CHECK-NEXT:    [[BLOCK32:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT33:%.*]] = insertelement <1 x double> poison, double [[TMP42]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT34:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT33]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP43:%.*]] = fmul <1 x double> [[BLOCK32]], [[SPLAT_SPLAT34]]
; CHECK-NEXT:    [[TMP44:%.*]] = fadd <1 x double> [[TMP41]], [[TMP43]]
; CHECK-NEXT:    [[BLOCK35:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT36:%.*]] = insertelement <1 x double> poison, double [[TMP45]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT36]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP46:%.*]] = fmul <1 x double> [[BLOCK35]], [[SPLAT_SPLAT37]]
; CHECK-NEXT:    [[TMP47:%.*]] = fadd <1 x double> [[TMP44]], [[TMP46]]
; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <1 x double> [[TMP47]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP49:%.*]] = shufflevector <3 x double> [[TMP39]], <3 x double> [[TMP48]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[BLOCK38:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT39:%.*]] = insertelement <1 x double> poison, double [[TMP50]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT39]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP51:%.*]] = fmul <1 x double> [[BLOCK38]], [[SPLAT_SPLAT40]]
; CHECK-NEXT:    [[BLOCK41:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT42:%.*]] = insertelement <1 x double> poison, double [[TMP52]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT43:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT42]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP53:%.*]] = fmul <1 x double> [[BLOCK41]], [[SPLAT_SPLAT43]]
; CHECK-NEXT:    [[TMP54:%.*]] = fadd <1 x double> [[TMP51]], [[TMP53]]
; CHECK-NEXT:    [[BLOCK44:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT45:%.*]] = insertelement <1 x double> poison, double [[TMP55]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT46:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT45]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP56:%.*]] = fmul <1 x double> [[BLOCK44]], [[SPLAT_SPLAT46]]
; CHECK-NEXT:    [[TMP57:%.*]] = fadd <1 x double> [[TMP54]], [[TMP56]]
; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <1 x double> [[TMP57]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP58]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK47:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT48:%.*]] = insertelement <1 x double> poison, double [[TMP60]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT49:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT48]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP61:%.*]] = fmul <1 x double> [[BLOCK47]], [[SPLAT_SPLAT49]]
; CHECK-NEXT:    [[BLOCK50:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT51:%.*]] = insertelement <1 x double> poison, double [[TMP62]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT52:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT51]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP63:%.*]] = fmul <1 x double> [[BLOCK50]], [[SPLAT_SPLAT52]]
; CHECK-NEXT:    [[TMP64:%.*]] = fadd <1 x double> [[TMP61]], [[TMP63]]
; CHECK-NEXT:    [[BLOCK53:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT54:%.*]] = insertelement <1 x double> poison, double [[TMP65]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT55:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT54]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP66:%.*]] = fmul <1 x double> [[BLOCK53]], [[SPLAT_SPLAT55]]
; CHECK-NEXT:    [[TMP67:%.*]] = fadd <1 x double> [[TMP64]], [[TMP66]]
; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <1 x double> [[TMP67]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP69:%.*]] = shufflevector <3 x double> [[TMP59]], <3 x double> [[TMP68]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK56:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT57:%.*]] = insertelement <1 x double> poison, double [[TMP70]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT58:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT57]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP71:%.*]] = fmul <1 x double> [[BLOCK56]], [[SPLAT_SPLAT58]]
; CHECK-NEXT:    [[BLOCK59:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT60:%.*]] = insertelement <1 x double> poison, double [[TMP72]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT61:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT60]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP73:%.*]] = fmul <1 x double> [[BLOCK59]], [[SPLAT_SPLAT61]]
; CHECK-NEXT:    [[TMP74:%.*]] = fadd <1 x double> [[TMP71]], [[TMP73]]
; CHECK-NEXT:    [[BLOCK62:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT63:%.*]] = insertelement <1 x double> poison, double [[TMP75]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT64:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT63]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP76:%.*]] = fmul <1 x double> [[BLOCK62]], [[SPLAT_SPLAT64]]
; CHECK-NEXT:    [[TMP77:%.*]] = fadd <1 x double> [[TMP74]], [[TMP76]]
; CHECK-NEXT:    [[TMP78:%.*]] = shufflevector <1 x double> [[TMP77]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP79:%.*]] = shufflevector <3 x double> [[TMP69]], <3 x double> [[TMP78]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[BLOCK65:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT66:%.*]] = insertelement <1 x double> poison, double [[TMP80]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT67:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT66]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP81:%.*]] = fmul <1 x double> [[BLOCK65]], [[SPLAT_SPLAT67]]
; CHECK-NEXT:    [[BLOCK68:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT69:%.*]] = insertelement <1 x double> poison, double [[TMP82]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT70:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT69]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP83:%.*]] = fmul <1 x double> [[BLOCK68]], [[SPLAT_SPLAT70]]
; CHECK-NEXT:    [[TMP84:%.*]] = fadd <1 x double> [[TMP81]], [[TMP83]]
; CHECK-NEXT:    [[BLOCK71:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT72:%.*]] = insertelement <1 x double> poison, double [[TMP85]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT73:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT72]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP86:%.*]] = fmul <1 x double> [[BLOCK71]], [[SPLAT_SPLAT73]]
; CHECK-NEXT:    [[TMP87:%.*]] = fadd <1 x double> [[TMP84]], [[TMP86]]
; CHECK-NEXT:    [[TMP88:%.*]] = shufflevector <1 x double> [[TMP87]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP88]], <3 x i32> <i32 3, i32 1, i32 2>
; CHECK-NEXT:    [[BLOCK74:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT75:%.*]] = insertelement <1 x double> poison, double [[TMP90]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT75]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP91:%.*]] = fmul <1 x double> [[BLOCK74]], [[SPLAT_SPLAT76]]
; CHECK-NEXT:    [[BLOCK77:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT78:%.*]] = insertelement <1 x double> poison, double [[TMP92]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT78]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP93:%.*]] = fmul <1 x double> [[BLOCK77]], [[SPLAT_SPLAT79]]
; CHECK-NEXT:    [[TMP94:%.*]] = fadd <1 x double> [[TMP91]], [[TMP93]]
; CHECK-NEXT:    [[BLOCK80:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 1>
; CHECK-NEXT:    [[TMP95:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT81:%.*]] = insertelement <1 x double> poison, double [[TMP95]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT82:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT81]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP96:%.*]] = fmul <1 x double> [[BLOCK80]], [[SPLAT_SPLAT82]]
; CHECK-NEXT:    [[TMP97:%.*]] = fadd <1 x double> [[TMP94]], [[TMP96]]
; CHECK-NEXT:    [[TMP98:%.*]] = shufflevector <1 x double> [[TMP97]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <3 x double> [[TMP89]], <3 x double> [[TMP98]], <3 x i32> <i32 0, i32 3, i32 2>
; CHECK-NEXT:    [[BLOCK83:%.*]] = shufflevector <3 x double> [[TMP7]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0
; CHECK-NEXT:    [[SPLAT_SPLATINSERT84:%.*]] = insertelement <1 x double> poison, double [[TMP100]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT85:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT84]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP101:%.*]] = fmul <1 x double> [[BLOCK83]], [[SPLAT_SPLAT85]]
; CHECK-NEXT:    [[BLOCK86:%.*]] = shufflevector <3 x double> [[TMP13]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1
; CHECK-NEXT:    [[SPLAT_SPLATINSERT87:%.*]] = insertelement <1 x double> poison, double [[TMP102]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT88:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT87]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP103:%.*]] = fmul <1 x double> [[BLOCK86]], [[SPLAT_SPLAT88]]
; CHECK-NEXT:    [[TMP104:%.*]] = fadd <1 x double> [[TMP101]], [[TMP103]]
; CHECK-NEXT:    [[BLOCK89:%.*]] = shufflevector <3 x double> [[TMP19]], <3 x double> poison, <1 x i32> <i32 2>
; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2
; CHECK-NEXT:    [[SPLAT_SPLATINSERT90:%.*]] = insertelement <1 x double> poison, double [[TMP105]], i32 0
; CHECK-NEXT:    [[SPLAT_SPLAT91:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT90]], <1 x double> poison, <1 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP106:%.*]] = fmul <1 x double> [[BLOCK89]], [[SPLAT_SPLAT91]]
; CHECK-NEXT:    [[TMP107:%.*]] = fadd <1 x double> [[TMP104]], [[TMP106]]
; CHECK-NEXT:    [[TMP108:%.*]] = shufflevector <1 x double> [[TMP107]], <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <3 x double> [[TMP99]], <3 x double> [[TMP108]], <3 x i32> <i32 0, i32 1, i32 3>
; CHECK-NEXT:    [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST92]], align 8
; CHECK-NEXT:    [[VEC_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i64 3
; CHECK-NEXT:    [[VEC_CAST95:%.*]] = bitcast double* [[VEC_GEP94]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST95]], align 8
; CHECK-NEXT:    [[VEC_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i64 6
; CHECK-NEXT:    [[VEC_CAST98:%.*]] = bitcast double* [[VEC_GEP97]] to <3 x double>*
; CHECK-NEXT:    [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[VEC_CAST98]], align 8
; CHECK-NEXT:    [[TMP111:%.*]] = fadd <3 x double> [[COL_LOAD93]], [[TMP49]]
; CHECK-NEXT:    [[TMP112:%.*]] = fadd <3 x double> [[COL_LOAD96]], [[TMP79]]
; CHECK-NEXT:    [[TMP113:%.*]] = fadd <3 x double> [[COL_LOAD99]], [[TMP109]]
; CHECK-NEXT:    [[TMP114:%.*]] = bitcast <9 x double>* [[C_PTR]] to double*
; CHECK-NEXT:    [[VEC_CAST100:%.*]] = bitcast double* [[TMP114]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP111]], <3 x double>* [[VEC_CAST100]], align 8
; CHECK-NEXT:    [[VEC_GEP101:%.*]] = getelementptr double, double* [[TMP114]], i64 3
; CHECK-NEXT:    [[VEC_CAST102:%.*]] = bitcast double* [[VEC_GEP101]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP112]], <3 x double>* [[VEC_CAST102]], align 8
; CHECK-NEXT:    [[VEC_GEP103:%.*]] = getelementptr double, double* [[TMP114]], i64 6
; CHECK-NEXT:    [[VEC_CAST104:%.*]] = bitcast double* [[VEC_GEP103]] to <3 x double>*
; CHECK-NEXT:    store <3 x double> [[TMP113]], <3 x double>* [[VEC_CAST104]], align 8
; CHECK-NEXT:    ret void
;


; Transpose %A.


; Lower multiply(transpose(%A), %B)


;  Embed result of multiply into flat vector.


; Load %C.


; Add column vectors.


; Store result columns.


entry:
  %a = load <9 x double>, <9 x double>* %A.Ptr, align 8
  %b = load <9 x double>, <9 x double>* %B.Ptr, align 8
  %a.trans  = call <9 x double> @llvm.matrix.transpose(<9 x double> %a, i32 3, i32 3)
  %mult = call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %a.trans, <9 x double> %b, i32 3, i32 3, i32 3)
  %c = load <9 x double>, <9 x double>* %C.Ptr, align 8
  %res = fadd <9 x double> %c, %mult

  store <9 x double> %res, <9 x double>* %C.Ptr, align 8
  ret void
}
